import os
import jieba
from collections import Counter
from wordcloud import WordCloud
import string



# Helpers below: strip punctuation from segmented words,
# and open/merge text files before reading their contents.
class Wordcloud():
    """Build a word-cloud image and a word-frequency Counter from a folder of .txt files.

    Workflow: merge every ``.txt`` file under *path* into one ``results.txt``,
    segment the text with jieba, strip punctuation and stop words, then render
    the remaining words with :class:`wordcloud.WordCloud` and save the image.
    """

    def __init__(self,
                 path,
                 save_path,
                 width,
                 height,
                 max_words,
                 min_length,
                 stop_words,
                 background_color = 'white',
                 font_path = "simhei.ttf",
                 cut_all = True,
                 ):
        # Folder that holds the source .txt files.
        self.path = path
        # Where the rendered word-cloud image is written.
        self.save_path = save_path
        # Pixel dimensions of the generated image.
        self.width = width
        self.height = height
        # Optional list of words to suppress (None disables the filter).
        self.stop_words = stop_words
        # jieba mode: True = full mode (every possible word), False = accurate mode.
        self.cut_all = cut_all
        # Maximum number of words rendered in the cloud.
        self.max_words = max_words
        # Font file; SimHei ships Chinese glyphs.
        self.font_path = font_path
        self.background_color = background_color
        # Words shorter than this are excluded from the frequency count.
        self.min_length = min_length

    def replace_punctutation(self, word):
        """Return *word* with common Chinese and ASCII punctuation removed.

        BUG FIX: the original called ``word.replace(...)`` without using the
        return value (strings are immutable, so nothing changed) and reused
        the stale loop variable ``i`` in the ASCII loop instead of ``j`` —
        punctuation was never actually stripped.
        """
        # Chinese punctuation not covered by string.punctuation.
        cn_punctuation = '【】《》：，（）、！？“”。./%；'
        # One C-level pass instead of chained .replace() calls.
        return word.translate(str.maketrans('', '', cn_punctuation + string.punctuation))

    def __open_split_file(self, path):
        """Merge all txt files under *path*, read the result, and return the segmented words."""
        # Concatenate every .txt file into a single results.txt.
        file_path = self.join_txt(path)
        file_words = self.open_file(file_path)
        return self.__seg_words(file_words)

    def open_file(self, path):
        """Read *path* line by line, strip all whitespace from each line, return the lines.

        Reads utf-8 to match the encoding :meth:`join_txt` writes with (the
        original read gbk while writing with the platform default, which only
        round-tripped on Chinese-locale Windows).
        """
        file_words = []
        with open(path, 'r', encoding='utf-8') as f:
            for line in f:
                # Collapse all internal/leading/trailing whitespace.
                file_words.append(''.join(line.split()))
        return file_words

    def __seg_words(self, file_words):
        """Segment every line with jieba and return the cleaned, non-empty words."""
        seg_lists = []
        for line in file_words:
            # cut_all=True is jieba's full mode; False is the accurate mode.
            seg_lists.extend(jieba.cut(line, cut_all=self.cut_all))
        results = []
        for result in seg_lists:
            result = self.replace_punctutation(result)
            if self.stop_words:
                result = self.delet_words(result)
            results.append(result)
        # Drop every empty string (the original removed at most one).
        return [word for word in results if word]

    def delet_words(self, word):
        """Return the filler word '的' when *word* contains any stop word, else *word*.

        NOTE(review): substituting '的' (a one-character filler, normally
        filtered later by min_length >= 2) rather than removing the word is
        the original behavior — preserved as-is; confirm before changing.
        """
        for stop in self.stop_words:
            if stop in word:
                word = "的"
        return word

    def __count_words(self, results):
        """Count the frequency of each word of at least *min_length* characters."""
        counter = Counter()
        for word in results:
            if len(word) >= self.min_length and word != '\n':
                counter[word] += 1
        return counter

    def word_cloud_and_count_words(self):
        """Generate the word-cloud image and the word-frequency counter.

        Returns:
            tuple: ``(collections.Counter, PIL.Image)`` — word frequencies and
            the rendered image (also saved to ``self.save_path``).
        """
        # Merge, read and segment the source text.
        words = self.__open_split_file(self.path)
        counter = self.__count_words(words)
        text = ' '.join(words)
        wordcloud = WordCloud(
            background_color=self.background_color,
            width=self.width,
            height=self.height,
            margin=2,
            max_words=self.max_words,
            font_path=self.font_path,
            random_state=100)  # fixed seed -> reproducible colour scheme
        word_cloud = wordcloud.generate(text)
        image = word_cloud.to_image()
        image.save(self.save_path)
        return counter, image

    def join_txt(self, path):
        """Concatenate every .txt file under *path* into ``results.txt``.

        Returns the path of the merged file.

        Fixes vs. original: the output handle is now closed via ``with`` even
        on error; the encoding is pinned to utf-8 (matching how the inputs are
        read); and ``results.txt`` itself is skipped so a second run does not
        merge the previous output into the new one.
        """
        out_path = os.path.join(path, 'results.txt')
        with open(out_path, 'w', encoding='utf-8') as out:
            for file_name in os.listdir(path):
                # Skip non-txt files and the merged output itself.
                if not file_name.endswith('.txt') or file_name == 'results.txt':
                    continue
                file_path = os.path.join(path, file_name)
                with open(file_path, encoding='utf-8') as src:
                    out.writelines(src)
                out.write('\n')  # separate each file's content with a newline
        return out_path

if __name__ == "__main__":
    # Build the generator directly with keyword arguments.
    cloud = Wordcloud(
        # Folder that contains the source text files.
        path="D:\\pycharm\\basic\\wordCloud\\广东省人民政府关于印发广东省推进农业供给侧结构性改革实施方案的通知",
        # Where the rendered word-cloud image is saved.
        save_path="D:\\pycharm\\basic\\wordCloud\\广东省人民政府关于印发广东省推进农业供给侧结构性改革实施方案的通知\\词云图.png",
        background_color="white",  # background colour of the cloud
        font_path="simhei.ttf",    # SimHei font (supports Chinese glyphs)
        cut_all=False,             # jieba accurate mode instead of full mode
        width=3000,                # image width in pixels
        height=5000,               # image height in pixels
        max_words=100,             # cap on words shown in the cloud
        min_length=2,              # minimum word length to count
        stop_words=None,           # optional list of words to drop
    )
    word_counter, cloud_image = cloud.word_cloud_and_count_words()
    print(word_counter)
    cloud_image.show()

